/* File: add_extra_datasets.do
 * Author: Luca Maini, Josh Feng
 * Purpose: Add additional data sources to the main dataset
 *
 * Date Created: 4/26/2022

*/

*** DATASET 1: drugs in an ATC-3 code where a small-overlap deal occurs

* Get a list of drug-years involved in small-overlap deals
use "${maindir}\combined_regression_dataset_with_valeant.dta", clear

foreach var in stlth_same_atc3_acqd_count stlth_same_atc3_acqr_count {
	bysort Product (year) : gen `var'_s = `var' != `var'[_n-1] if _n != 1
	bysort Product (year) : replace `var'_s = `var' if _n == 1
	}

keep if stlth_same_atc3_acqd_count_s == 1 | stlth_same_atc3_acqr_count_s == 1
keep Product year mktCompany stlth_same_atc3_acqd_count_s ///
	stlth_same_atc3_acqr_count_s

* merge with info on ATC-3 to identify the ones where the consolidation happens
joinby Product using "${inputdir}\ssr_product_atc4_codes.dta", ///
	unmatched(master)
drop _merge

replace atc3 = "" if atc3 == "NA"
replace atc4 = "" if atc4 == "NA"

* At this point the code diverges. We preserve the data and create the datasets
* we need one by one
preserve

*** DATASET 1: drugs in an ATC-3 code where a small-overlap deal occurs
drop atc4
duplicates drop

* identify ATC-3 codes where there was overlap
bysort mktCompany year atc3 : egen has_acqd = max(stlth_same_atc3_acqd_count_s & atc3 != "")
bysort mktCompany year atc3 : egen has_acqr = max(stlth_same_atc3_acqr_count_s & atc3 != "")

* drop ATC-3 codes without overlap (these are codes that belong to only one of 
* the two drugs that create the overlap)
keep if has_acqd == 1 & has_acqr == 1
drop has_acqd has_acqr stlth_same_atc3_acqd_count_s ///
	stlth_same_atc3_acqr_count_s mktCompany

compress

* reshape to get a list of affected ATC-3 codes and years, with a list of 
* associated products
rename Product overlap_Product
bysort atc3 year : gen n = _n
reshape wide overlap_Product, i(atc3 year) j(n)

save "${outdir}\small_overlap_deal_atc3_codes.dta", replace

* Now join by ATC3 to find other drugs in the affected atc3 markets
use "${inputdir}\ssr_product_atc4_codes.dta", clear
drop atc4
drop if atc3 == "NA"
duplicates drop

joinby atc3 using "${outdir}\small_overlap_deal_atc3_codes.dta"
foreach var of varlist overlap_Product* {
	drop if Product	== `var' // these are directly affected
	drop `var'
	}

keep Product year	// keep only list of Product-years
duplicates drop

save "${outdir}\drugs_in_stlth_overlap_atc3.dta", replace


*** DATASET 2: drugs sharing the same ATC-4 code as a drug involved in a small
***			   overlap deal.
restore

drop atc3 stlth_same_atc3_acqd_count_s stlth_same_atc3_acqr_count_s mktCompany

* reshape to get a list of affected ATC-4 codes and years, with a list of 
* associated products
rename Product overlap_Product
bysort atc4 year : gen n = _n
reshape wide overlap_Product, i(atc4 year) j(n)

* merge on atc-4 with all other drugs
joinby atc4 using "${inputdir}\ssr_product_atc4_codes.dta"

foreach var of varlist overlap_Product* {
	drop if Product	== `var' // these are directly affected
	drop `var'
	}

keep Product year	// keep only list of Product-years
duplicates drop

save "${outdir}\drugs_in_stlth_overlap_atc4.dta", replace



/*
Merge the data sources
*/


// 0) Start with the combined dataset created in make_main_data.do

foreach dataset in combined_regression_dataset combined_regression_dataset_with_valeant {

	use "${maindir}\\`dataset'.dta", clear

	* Merge in indicators for spillover effects
	merge 1:1 Product year using "${outdir}\drugs_in_stlth_overlap_atc3.dta", ///
		keep(master match) gen(stlth_same_atc3_spill)
	* transform merge variable in indicator
	replace stlth_same_atc3_spill = stlth_same_atc3_spill == 3
	label var stlth_same_atc3_spill "Same ATC-3 as stealth overlap event"
	label drop _merge
	
	merge 1:1 Product year using "${outdir}\drugs_in_stlth_overlap_atc4.dta", ///
		keep(master match) gen(stlth_same_atc3_comp)
	* transform merge variable in indicator
	replace stlth_same_atc3_comp = stlth_same_atc3_comp == 3
	label var stlth_same_atc3_comp "Same ATC-4 as stealth overlap drug"
	label drop _merge
	
	* Transform to count variables
	foreach var in stlth_same_atc3_spill stlth_same_atc3_comp {
		bysort Product (year) : gen `var'_count = sum(`var')
		local lbl : variable label `var'
		label var `var'_count "`lbl' - COUNT"

		}

	compress
	save "${maindir}\\`dataset'_augmented.dta", replace
	
	}
